Since the target variable is numeric, we use a linear regression model.
An insurance company wants to predict customer lifetime value from the different qualitative and quantitative features provided.
They have been operating for the last few years and maintain all transactional data. The given file 'CustomerData.csv' is a sample of customer-level data extracted and processed for this analysis from various transactional files.
calculate average purchase value
calculate average purchase frequency rate
calculate customer value
calculate average purchase frequency rate
Calculate average customer lifespan
calculate CLTV
A quality management team within an automaker operates in a challenging field of tension between customer satisfaction, regulation, and cost control.
A predictive model can be built to automate the customer-lifetime-value process, reducing subjectivity in the CLV estimates.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
#### Load the required libraries
%matplotlib inline
import os
import inspect
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import SVG
from IPython.display import display
import graphviz
import seaborn
from sklearn.tree import DecisionTreeClassifier
import pandas as pd#(for dataframes)
import numpy as np#(linear algebra)
from sklearn.model_selection import train_test_split#(for spliting the data)
from sklearn.impute import SimpleImputer#(for handling NA's values)
from sklearn.preprocessing import StandardScaler#(it helps in calculation of num data)
#required libraries for visulaization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
sns.set_palette("GnBu_d")
sns.set_style('whitegrid')
import missingno as msno
import pandas_profiling as pp
#There is an indication given in the result that there might exist a strong multicollinearity in the data.
#Lets use variance inflation factor (VIF) to understand if there exist a multicollinearity and remove those attributes
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import LabelEncoder
enc=LabelEncoder()
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz,DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import math
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
#mlxtend : Machine learning extensions
import random
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import networkx as nx
## import libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
#mlxtend : Machine learning extensions
import random
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import networkx as nx
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
## Extract the rules
from mlxtend.frequent_patterns import association_rules
!pip install mlxtend
!pip install networkx
from sklearn.metrics import accuracy_score, recall_score, precision_score,confusion_matrix,mean_absolute_error,mean_squared_error,classification_report
# since pip install graphviz was unable to load this is the extended process to load the library
! pip install graphviz
! pip install mlxtend
! pip install missingno
! pip install pandas_profiling
# 1. Setting the current working directory for the Jupyter notebook
# Show which directory the notebook is currently running in.
os.getcwd()
os
# chdir moves the working directory to the folder holding the data files
# (machine-specific path; will need editing on another machine).
os.chdir('C:\\Users\\smithika\\Desktop\\final mith')
# Load the training data into a DataFrame.
df= pd.read_csv("train-1574429526318.csv")
# Display the full loaded data.
df
# First five rows, to get a quick feel for the columns.
df.head(5)
# Confirm the object type (pandas.DataFrame).
type(df)
# Number of rows and columns.
df.shape
# Summary statistics for all columns, numeric and categorical alike.
df.describe(include='all')
# Inspect the rows at the extreme values seen in describe():
# customers with 5 open complaints, and customers holding 9 policies.
df.loc[ (df['number.of.open.complaints']==5.000000),]
df.loc[ (df['number.of.policies']==9.000000),]
# Normalize column names to lower case with underscores for convenience.
df.columns = [x.lower().replace(' ','_') for x in df.columns]
# Row index of the DataFrame.
df.index
# Verify the column names were converted.
df.head()
# Helper below summarizes dtypes, levels, nulls and unique counts per column.
def levels(df):
    """Summarize each column of *df*.

    Parameters
    ----------
    df : pd.DataFrame

    Returns
    -------
    pd.DataFrame indexed by column name, with columns:
    'dtype'        -- the column dtype,
    'levels'       -- array of the distinct values,
    'null_values'  -- count of missing values,
    'unique'       -- number of distinct values.
    """
    # NOTE: the original dict literal listed the key 'levels' twice; the
    # first entry (df.nunique()) was silently discarded by the duplicate
    # key. The count is already reported under 'unique', so only the
    # distinct-value list is kept here -- behavior is unchanged.
    return pd.DataFrame({
        'dtype': df.dtypes,
        'levels': [df[x].unique() for x in df.columns],
        'null_values': df.isna().sum(),
        'unique': df.nunique(),
    })
# Per-column summary (dtype, distinct values, nulls, unique counts).
levels(df)
# Drop columns not useful for modelling: geography and the row identifier.
df.drop("location.geo",axis=1,inplace=True)
df.drop("customerid",axis=1,inplace=True)
df.head(3)
# Total missing values per column.
df.isna().sum()
# Drop rows with any missing value so the numeric plots below work cleanly.
df.dropna(axis=0, inplace=True)
# Flag duplicate rows (all occurrences after the first).
df2=df.duplicated(keep='first')
df2
print(df.shape)
# Names of the numeric attributes.
num_attr = df.select_dtypes(include=['int64', 'float64']).columns
num_attr
# Names of the categorical (object-dtype) attributes.
num_cat= df.select_dtypes(include=['object']).columns
num_cat
# Pairwise correlations among the numeric attributes.
df[num_attr].corr()
# Scatter matrix with KDE on the diagonal -- quick look at joint distributions.
pd.plotting.scatter_matrix (df, figsize=(16, 16), diagonal='kde')
plt.show()
plt.figure(figsize=(8,8))
sns.heatmap(df.corr())
plt.show()
### Plotting Categorical Data
sns.countplot(x="vehicle.class", data=df)
plt.show()
# NOTE(review): this tuple still lists 'location.geo', which was dropped
# above, and it is never used afterwards (the loop below spells the columns
# out literally) -- it appears to be dead code.
cat=('coverage', 'education', 'employmentstatus', 'gender', 'income',
'location.geo', 'location.code', 'marital.status', 'policy.type',
'policy', 'renew.offer.type', 'sales.channel', 'vehicle.class')
cat
# Cast the qualitative columns to pandas 'category' dtype.
# NOTE(review): 'income' looks numeric by name but is cast to category here
# -- confirm this is intentional.
for col in ['coverage','education','employmentstatus','gender','income',
'location.code',
'marital.status',
'policy.type',
'policy',
'renew.offer.type',
'sales.channel',
'vehicle.class']:
    df[col] = df[col].astype('category')
df[col]
# One-hot encode the categorical columns, dropping the first level of each
# to avoid the dummy-variable trap.
cols=["coverage", "education","employmentstatus","gender","income",
"marital.status",
"policy.type",
"policy",
"renew.offer.type",
"sales.channel",
"vehicle.class"]
df=pd.get_dummies(columns=cols,data=df,drop_first=True)
df.head(3)
# Re-check missing values after encoding and drop any remaining NA rows.
df.isna().sum()
df.dropna(axis=0, inplace=True)
df.isna().sum()
df.dtypes
# Value counts for the categorical attributes.
# NOTE(review): get_dummies above replaced 'vehicle.class', 'renew.offer.type',
# 'sales.channel', 'policy', 'employmentstatus' and 'policy.type' with dummy
# columns, so these lookups should raise KeyError if the cells run in this
# order -- this section presumably ran before the encoding cell; verify.
df['vehicle.class'].value_counts()
df['vehicle.size'].value_counts()
sns.countplot(x="vehicle.size", data=df)
plt.show()
df['renew.offer.type'].value_counts()
df['sales.channel'].value_counts()
df['policy'].value_counts()
df['employmentstatus'].value_counts()
df['policy.type'].value_counts()
# Target distribution per vehicle class and per employment status.
sns.boxplot(x="vehicle.class", y="customer.lifetime.value", data=df, palette="PRGn")
plt.show()
sns.boxplot(x="employmentstatus", y="customer.lifetime.value", data=df, palette="PRGn")
plt.show()
# NOTE(review): countplot of a continuous target is uninformative -- a
# histogram (as below) is the useful view.
sns.countplot(x="customer.lifetime.value", data=df)
plt.show()
# Histogram + KDE of the total claim amount.
sns.distplot(df["total.claim.amount"] )
# Bins can be changed.
plt.show()
# Histogram + KDE of the target, customer lifetime value.
sns.distplot(df["customer.lifetime.value"] )
# Bins can be changed.
plt.show()
df['customer.lifetime.value'].value_counts()
# Scatter plot with fitted regression line: claim amount vs lifetime value.
sns.lmplot(x='total.claim.amount', y='customer.lifetime.value', data=df)
plt.show()
# Train/test split: X = all columns except the target, y = the target.
X, y = df.loc[:,df.columns!='customer.lifetime.value'], df.loc[:,'customer.lifetime.value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
# Distribution of the target in train and test partitions.
print(pd.value_counts(y_train))
print(pd.value_counts(y_test))
# Reload the raw training data for a second, separate preprocessing pipeline
# (the earlier `df` pipeline above is left untouched).
train_data = pd.read_csv("train-1574429526318.csv")
train_data.head()
# Shape of the reloaded data.
train_data.shape
# Helper below summarizes the levels of each column of the train data.
def understanding_data(data):
    """Return a per-column overview of *data*: the dtype, the number of
    distinct levels, and the distinct values rendered as a string."""
    summary = {
        "Data Type": data.dtypes,
        "No of Levels": data.apply(lambda col: col.nunique(), axis=0),
        "Levels": data.apply(lambda col: str(col.unique()), axis=0),
    }
    return pd.DataFrame(summary)
understanding_data(train_data)
# Summary statistics of the train data.
train_data.describe()
# Automated EDA report (pandas-profiling).
pp.ProfileReport(train_data)
# Correlation heatmap of the numeric columns.
temp=train_data.corr()
fig = plt.figure(figsize=(10,8))
sns.heatmap(temp,annot=True)
# Frequency distribution (histogram) of every numeric column.
train_data.hist(bins=50, figsize=(20,20))
plt.show()
# Missing-value counts per column.
train_data.isnull().sum()
# Missing data visualized as a matrix (missingno).
msno.matrix(train_data)
train_data.dtypes
# NOTE(review): this lowercase num_cols is dead code -- it is overwritten by
# the capitalized version a few lines below. Only one spelling can match the
# actual CSV headers; confirm which and delete the other.
num_cols = ['customer.lifetime.value','monthly.premium.auto','number.of.open.complaints','number.of.policies','total.claim.amount','vehicle.size']
cat_cols = train_data.columns.difference(num_cols)
cat_cols
# Split column names into numeric vs categorical (everything not numeric).
num_cols = ['Customer.Lifetime.Value','Monthly.Premium.Auto','Number.of.Open.Complaints','Number.of.Policies','Total.Claim.Amount','Vehicle.Size']
cat_cols = train_data.columns.difference(num_cols)
cat_cols
cat_cols
# Take copies of the numeric and categorical slices.
# NOTE(review): these copies are taken BEFORE the dtype casts below, so the
# casts on train_data do not affect num_data / cat_data.
num_data = train_data.loc[:,num_cols]
cat_data = train_data.loc[:,cat_cols]
train_data[cat_cols] = train_data[cat_cols].apply(lambda x: x.astype('category'))
train_data[num_cols] = train_data[num_cols].apply(lambda x: x.astype('float'))
train_data.dtypes
# Numeric columns imputation: replace NaN with the column median.
imp = SimpleImputer(missing_values=np.nan, strategy='median')
num_data = pd.DataFrame(imp.fit_transform(num_data),columns=num_cols)
# Categorical columns imputation: replace NaN with the column mode.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
cat_data = pd.DataFrame(imp.fit_transform(cat_data),columns=cat_cols)
# Verify no missing values remain after imputation.
print(num_data.isnull().sum())
print(cat_data.isnull().sum())
# NOTE(review): this drop has no lasting effect -- train_data is rebuilt from
# num_data/cat_data two lines below, and CustomerID is in cat_cols, so it
# reappears. Drop it from cat_data (or cat_cols) instead.
train_data.drop(['CustomerID'], axis=1, inplace=True)
# Standardize the numeric columns (zero mean, unit variance).
standardizer = StandardScaler()
standardizer.fit(num_data)
num_data = pd.DataFrame(standardizer.transform(num_data),columns=num_cols)
# Reassemble the processed frame and one-hot encode the categoricals.
train_data = pd.concat([num_data,cat_data],axis=1)
train_data = pd.get_dummies(train_data,columns=cat_cols,drop_first=True)
train_data.head()
# Separate predictors and target, then hold out 30% for validation.
x = train_data.copy().drop("Customer.Lifetime.Value",axis=1)
y = train_data["Customer.Lifetime.Value"]
x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=0.30,random_state=1)
print(train_data.shape)
print(x_train.shape)
print(y_train.shape)
### Simple Linear Regression using Statsmodels package
## To get all columns except the target, and the target.
x.columns
print(x.columns.values[-1])
print(x.columns.values[:-1])
# Fit an unconstrained decision-tree regressor as a baseline.
dtr = DecisionTreeRegressor()
dtr.fit(x_train,y_train)
# Predictions on train and validation sets.
pred_train = dtr.predict(x_train)
pred_val = dtr.predict(x_validation)
pred_train
pred_val
# Refit with max_depth=2 to limit overfitting.
# NOTE(review): this rebinds `dtr`, discarding the unconstrained tree above;
# the validation error of the depth-limited model is never printed.
dtr = DecisionTreeRegressor(max_depth=2)
dtr.fit(x_train,y_train)
pred_train = dtr.predict(x_train)
print("Train Error:", mean_absolute_error(y_train,pred_train))
# Load the held-out test file for final scoring.
test_df= pd.read_csv("test-1574429501088.csv")
test_df.head(3)
test_df.describe()
from sklearn.model_selection import GridSearchCV
def model_building(model, params = None, k = 1) :
    """Fit *model* on the module-level X_train/y_train, optionally tuning
    hyper-parameters with GridSearchCV.

    Parameters
    ----------
    model : estimator with fit/predict (sklearn-style).
    params : dict or None
        GridSearchCV param_grid; None means a plain fit with no search.
    k : int
        Number of cross-validation folds. GridSearchCV requires k >= 2,
        so the default of 1 will raise inside sklearn whenever *params*
        is supplied -- kept for backward compatibility; pass k explicitly
        when tuning.

    Returns
    -------
    (model, train_preds, test_preds) for a plain fit, or
    (grid_search, best_model, train_preds, test_preds) when tuning.

    Notes
    -----
    Relies on the globals X_train, X_test, y_train created by the earlier
    train_test_split cell.
    """
    # Identity check -- `params == None` was unidiomatic and can misfire on
    # objects that overload __eq__.
    if params is None :
        model.fit(X_train, y_train)
        # return fitted model & train-test predictions
        return (model, model.predict(X_train), model.predict(X_test))
    else :
        model_cv = GridSearchCV(model, param_grid = params, cv = k)
        model_cv.fit(X_train, y_train)
        model = model_cv.best_estimator_
        # return an extra object for all cross validation operations
        return (model_cv, model, model.predict(X_train), model.predict(X_test))
### Load the required libraries
%matplotlib inline
import inspect
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import SVG
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# Transformers for the held-out test set: mean imputation + standardization.
# NOTE(review): both are fit on the TEST data below, not on the training
# data -- this is inconsistent with the train pipeline and a leakage risk;
# reuse the train-fit imputer/standardizer instead.
num_imputer = SimpleImputer()
num_scaler = StandardScaler()
import seaborn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz,DecisionTreeRegressor
from sklearn.metrics import accuracy_score, recall_score, precision_score,confusion_matrix,mean_absolute_error,mean_squared_error,classification_report
# Impute and scale the raw test frame.
# NOTE(review): SimpleImputer's default mean strategy will fail on any
# non-numeric columns in test_df -- presumably test_df is all-numeric here;
# verify.
X_actual_test = test_df
num_imputer.fit(X_actual_test)
num_scaler.fit(X_actual_test)
X_actual_test = num_imputer.transform(X_actual_test)
X_actual_test = num_scaler.transform(X_actual_test)
X_actual_test = pd.DataFrame(X_actual_test)
X_actual_test.isna().sum()
# Score the test set.
# NOTE(review): `clf` is never defined in this file -- presumably a model
# trained in a cell not shown here (perhaps `dtr` was intended); confirm.
actual_test_pred = clf.predict(X_actual_test)
actual_test_pred
new_pred = pd.DataFrame(actual_test_pred)
# Load the submission template and fill in the predictions.
sample= pd.read_csv('sample_submission-1577482703002.csv')
# NOTE(review): attribute access cannot contain a dot -- this line will raise
# AttributeError; likely sample['Customer.Lifetime.Value'].head() was meant.
sample.customerLifetime.Value.head()
# NOTE(review): if 'target' is not an existing column this sets a plain
# attribute, not a DataFrame column -- use sample['target'] = ... instead.
sample.target=actual_test_pred